# Load the necessary package (if not already installed)
#install.packages("corrplot")
#install.packages("skimr")
#install.packages("tidyverse")
#install.packages("Hmisc")
#install.packages("readr")
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("tidyr")
#install.packages("scales")
library(corrplot)
## corrplot 0.92 loaded
library(skimr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Hmisc)
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Remote location of the cereal data set (raw CSV hosted on GitHub)
url <- "https://raw.githubusercontent.com/kwartler/Hult_Intro2R/main/A1_CerealEDA/cereals.csv"
# Download the file and parse it into a data frame with readr
data <- readr::read_csv(file = url)
## Rows: 185 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): cerealName, parsedName, brand, dietLabels, healthLabels, rawGPTRan...
## dbl (59): calories, Energy_kcal, Total.lipid..fat._g, Fatty.acids..total.sat...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the first occurrence of every fully identical row
data <- data[which(!duplicated(data)), ]
# Remove the 'parsedName' column, which is not needed for the analysis
data <- data[, colnames(data) != "parsedName"]
# Show the cleaned data frame
print(data)
## # A tibble: 185 × 65
## cerealName brand dietLabels healthLabels calories Energy_kcal
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 100% Bran Cereal nabisco HIGH_FIBE… LOW_FAT_ABS… 161 161.
## 2 All Bran Bran Buds kellog HIGH_FIBE… LOW_FAT_ABS… 231 232.
## 3 Almond Oatmeal Crisp general… LOW_FAT, … FAT_FREE, L… 201 201.
## 4 Apple Cinnamon Cheerios general… LOW_SODIUM LOW_FAT_ABS… 105 105.
## 5 Banana Nut Crunch post LOW_CARB,… SUGAR_CONSC… 813 814.
## 6 Barley, Wheat quaker HIGH_FIBE… LOW_FAT_ABS… 556 556.
## 7 Blueberry Morning, Post post LOW_FAT, … FAT_FREE, L… 84 84.4
## 8 Bran Cereal nabisco HIGH_FIBE… LOW_FAT_ABS… 161 161.
## 9 Bran Flakes kellog HIGH_FIBE… LOW_FAT_ABS… 161 161.
## 10 Bran, Raisin quaker LOW_FAT, … FAT_FREE, L… 433 434.
## # ℹ 175 more rows
## # ℹ 59 more variables: Total.lipid..fat._g <dbl>,
## # Fatty.acids..total.saturated_g <dbl>, Fatty.acids..total.trans_g <dbl>,
## # Fatty.acids..total.monounsaturated_g <dbl>,
## # Fatty.acids..total.polyunsaturated_g <dbl>,
## # Carbohydrate..by.difference_g <dbl>, Carbohydrates..net._g <dbl>,
## # Fiber..total.dietary_g <dbl>, Sugars..total_g <dbl>, Protein_g <dbl>, …
########################################
# #
# BASIC EDA #
# #
########################################
########################################
# #
# HISTOGRAM ANALYSIS #
# #
########################################
# Plot one histogram per numeric column of `data`.
# The columns are measured on very different scales (for example kcal versus
# mg), so a single fixed bin width cannot suit all of them; a fixed number of
# bins is used instead so that every histogram stays readable.
numeric_cols <- names(data)[vapply(data, is.numeric, logical(1))]
for (col in numeric_cols) {
  # `.data[[col]]` looks the column up by its name held in `col`
  hist_plot <- ggplot(data, aes(x = .data[[col]])) +
    # after_stat(count) replaces the deprecated `..count..` notation
    geom_histogram(aes(fill = after_stat(count)), bins = 30, color = "black") +
    labs(x = col, y = "Count", title = paste("Histogram of", col)) +
    scale_fill_gradient(low = "blue", high = "red") + # Colour bars by height
    theme_minimal() # Simple white background
  # Plots created inside a loop are not auto-printed, so print explicitly
  print(hist_plot)
}




## Warning: Removed 64 rows containing non-finite values (`stat_bin()`).





## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 17 rows containing non-finite values (`stat_bin()`).










## Warning: Removed 5 rows containing non-finite values (`stat_bin()`).






## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).


## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 17 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 26 rows containing non-finite values (`stat_bin()`).






## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).










## Warning: Removed 5 rows containing non-finite values (`stat_bin()`).






## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 17 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 26 rows containing non-finite values (`stat_bin()`).

########################################
# #
# GROUPBY ANALYSIS #
# #
########################################
# Average calories and energy for each brand (missing values are ignored)
group_by_brand <- summarise(
  group_by(data, brand),
  mean_calories = mean(calories, na.rm = TRUE),
  mean_energy = mean(Energy_kcal, na.rm = TRUE)
)
print(group_by_brand)
## # A tibble: 5 × 3
## brand mean_calories mean_energy
## <chr> <dbl> <dbl>
## 1 generalMills 300. 300.
## 2 kellog 246. 247.
## 3 nabisco 251. 251.
## 4 post 426. 426.
## 5 quaker 478. 479.
# Bar chart of the per-brand calorie averages; the legend is hidden because
# the brand names are already shown on the x-axis
ggplot(group_by_brand, aes(x = brand, y = mean_calories, fill = brand)) +
  geom_col() +
  ggtitle("Average Calories per Brand") +
  xlab("Brand") +
  ylab("Average Calories") +
  theme_minimal() +
  theme(legend.position = "none")

# Average total fat and cholesterol for every brand / diet-label combination.
# `.groups = "drop"` returns an ungrouped tibble, so later steps are not
# affected by leftover grouping and summarise() emits no grouping message.
group_by_brand_diet <- data %>%
  group_by(brand, dietLabels) %>%
  summarise(
    mean_fat = mean(Total.lipid..fat._g, na.rm = TRUE),
    mean_cholesterol = mean(Cholesterol_mg, na.rm = TRUE),
    .groups = "drop"
  )
# Side-by-side bars: average fat per brand, one bar per diet-label combination
ggplot(group_by_brand_diet, aes(x = brand, y = mean_fat, fill = dietLabels)) +
  geom_col(position = "dodge") +
  ggtitle("Average Fat per Brand by Diet Labels") +
  xlab("Brand") +
  ylab("Average Fat") +
  theme_minimal()

# Count how many cereals carry each 'dietLabels' value
dietLabels_counts <- data %>%
  dplyr::count(dietLabels)
# Get the unique values of 'dietLabels'
unique_labels <- unique(dietLabels_counts$dietLabels)
# Define custom colors
my_colors <- c("#FF0000", "#00FF00", "#0000FF", "#FFFF00", "#FF00FF", "#00FFFF", "#FFA500", "#008000", "#800080")
# Only non-missing labels need a palette entry. Indexing the palette with
# 1:length(unique_labels) padded it with NA whenever there were more labels
# than colours, so the palette is sized explicitly instead and interpolated
# when the custom colours do not suffice.
n_labels <- sum(!is.na(unique_labels))
if (n_labels <= length(my_colors)) {
  fill_colors <- my_colors[seq_len(n_labels)]
} else {
  fill_colors <- grDevices::colorRampPalette(my_colors)(n_labels)
}
# Bar chart of the label frequencies, bars ordered by increasing count
ggplot2::ggplot(dietLabels_counts, ggplot2::aes(x = reorder(dietLabels, n), y = n, fill = dietLabels)) +
  # One bar per label, height taken directly from the count column
  ggplot2::geom_bar(stat = "identity") +
  # Axis labels and chart title
  ggplot2::labs(x = "Diet Labels", y = "Frequency", title = "Frequency Count of Diet Labels") +
  # Rotate the x-axis labels so that long label names do not overlap
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
  # Apply the manually chosen fill colours
  ggplot2::scale_fill_manual(values = fill_colors)

########################################
# #
# HEALTH ANALYSIS #
# #
########################################
# One-hot encode the comma-separated 'healthLabels' column.
# Split every row into its individual labels (", " is the separator).
health_label_lists <- strsplit(data$healthLabels, ", ", fixed = TRUE)
# All labels across all rows, as one character vector
all_labels <- unlist(health_label_lists)
# Distinct labels; missing values are not labels and are left out
unique_labels <- unique(all_labels[!is.na(all_labels)])
#print(unique_labels)
# This loop iterates over each unique label in unique_labels and prints it using the cat() function. Each label is printed on a new line.
#for (label in unique_labels) {
# cat(label, "\n")
#}
# Build a logical matrix with one row per cereal and one column per label.
# Labels are compared as whole tokens: a substring/regex search with grepl()
# would, for example, report FISH_FREE for a row that only has SHELLFISH_FREE.
# vapply() guarantees a matrix even for a single row or a single label.
encoded_data <- vapply(
  unique_labels,
  function(label) {
    vapply(
      health_label_lists,
      function(row_labels) label %in% row_labels,
      logical(1)
    )
  },
  logical(nrow(data))
)
# Convert the matrix to a data frame (one logical column per label)
encoded_data <- as.data.frame(encoded_data)
# Append the indicator columns to the original data
data <- cbind(data, encoded_data)
# Health-label indicator columns whose prevalence is reported below
column_names <- c(
  "LOW_FAT_ABS", "VEGAN", "VEGETARIAN", "PESCATARIAN", "MEDITERRANEAN",
  "DAIRY_FREE", "EGG_FREE", "MILK_FREE", "PEANUT_FREE", "TREE_NUT_FREE",
  "SOY_FREE", "FISH_FREE", "SHELLFISH_FREE", "PORK_FREE", "RED_MEAT_FREE",
  "CRUSTACEAN_FREE", "CELERY_FREE", "MUSTARD_FREE", "SESAME_FREE",
  "LUPINE_FREE", "MOLLUSK_FREE", "ALCOHOL_FREE", "NO_OIL_ADDED",
  "SULPHITE_FREE", "KOSHER", "FAT_FREE", "SUGAR_CONSCIOUS", "LOW_POTASSIUM",
  "KIDNEY_FRIENDLY", "WHEAT_FREE", "LOW_SUGAR", "KETO_FRIENDLY", "DASH",
  "GLUTEN_FREE", "NO_SUGAR_ADDED", "FODMAP_FREE", "PALEO", "SPECIFIC_CARBS"
)
# Percentage of cereals carrying each label: the column mean of a logical
# indicator is the fraction of TRUE values, scaled here to percent
percentages <- 100 * colMeans(data[, column_names])
# Show the percentages, one named entry per label
print(percentages)
## LOW_FAT_ABS VEGAN VEGETARIAN PESCATARIAN MEDITERRANEAN
## 76.216216 89.189189 100.000000 100.000000 57.297297
## DAIRY_FREE EGG_FREE MILK_FREE PEANUT_FREE TREE_NUT_FREE
## 97.297297 98.918919 98.378378 92.972973 82.702703
## SOY_FREE FISH_FREE SHELLFISH_FREE PORK_FREE RED_MEAT_FREE
## 97.837838 100.000000 100.000000 100.000000 100.000000
## CRUSTACEAN_FREE CELERY_FREE MUSTARD_FREE SESAME_FREE LUPINE_FREE
## 100.000000 100.000000 100.000000 100.000000 100.000000
## MOLLUSK_FREE ALCOHOL_FREE NO_OIL_ADDED SULPHITE_FREE KOSHER
## 100.000000 100.000000 76.216216 92.432432 100.000000
## FAT_FREE SUGAR_CONSCIOUS LOW_POTASSIUM KIDNEY_FRIENDLY WHEAT_FREE
## 27.567568 41.621622 58.378378 81.621622 43.243243
## LOW_SUGAR KETO_FRIENDLY DASH GLUTEN_FREE NO_SUGAR_ADDED
## 9.189189 11.351351 32.972973 40.540541 25.945946
## FODMAP_FREE PALEO SPECIFIC_CARBS
## 18.378378 13.513514 12.972973
# Health-label indicator columns to plot, one chart per label
logical_columns <- c("LOW_FAT_ABS", "VEGAN", "VEGETARIAN", "PESCATARIAN", "MEDITERRANEAN", "DAIRY_FREE",
                     "EGG_FREE", "MILK_FREE", "PEANUT_FREE", "TREE_NUT_FREE", "SOY_FREE", "FISH_FREE",
                     "SHELLFISH_FREE", "PORK_FREE", "RED_MEAT_FREE", "CRUSTACEAN_FREE", "CELERY_FREE",
                     "MUSTARD_FREE", "SESAME_FREE", "LUPINE_FREE", "MOLLUSK_FREE", "ALCOHOL_FREE",
                     "NO_OIL_ADDED", "SULPHITE_FREE", "KOSHER", "FAT_FREE", "SUGAR_CONSCIOUS", "LOW_POTASSIUM",
                     "KIDNEY_FRIENDLY", "WHEAT_FREE", "LOW_SUGAR", "KETO_FRIENDLY", "DASH", "GLUTEN_FREE",
                     "NO_SUGAR_ADDED", "FODMAP_FREE", "PALEO", "SPECIFIC_CARBS")
# Draw, for every label, the share of TRUE/FALSE values within each brand
for (col in logical_columns) {
  # Keep only the brand and the indicator column currently being plotted
  data_logical <- data[, c("brand", col)]
  # Reshape to long format; pivot_longer() replaces the superseded gather()
  data_logical_long <- data_logical %>%
    tidyr::pivot_longer(
      cols = -brand,
      names_to = "column",
      values_to = "logical_value"
    )
  # Stacked bars scaled to proportions (position = "fill"), one bar per brand
  plot_logical <- ggplot(data_logical_long, aes(x = brand, fill = logical_value)) +
    geom_bar(position = "fill") +
    labs(x = "Brand", y = "Proportion", title = paste("Proportion of", col, "by Brand")) +
    facet_wrap(~column, scales = "free_x", nrow = 1) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  # Plots created inside a loop are not auto-printed, so print explicitly
  print(plot_logical)
}






































########################################
# #
# DIET ANALYSIS #
# #
########################################
# One-hot encode the comma-separated 'dietLabels' column.
# Split every row into its individual labels (", " is the separator).
diet_label_lists <- strsplit(data$dietLabels, ", ", fixed = TRUE)
# All diet labels across all rows, as one character vector
all_dietLabels <- unlist(diet_label_lists)
# Distinct labels; rows with a missing 'dietLabels' value contribute NA, which
# is not a label and must not become an indicator column of its own
unique_dietLabels <- unique(all_dietLabels[!is.na(all_dietLabels)])
#print(unique_dietLabels)
# This loop iterates over each unique label in unique_dietLabels and prints it using the cat() function. Each label is printed on a new line.
#for (label in unique_dietLabels) {
# cat(label, "\n")
#}
# Build a logical matrix with one row per cereal and one column per label.
# Labels are compared as whole tokens rather than searched as substrings, so a
# label can never match merely because it is contained in a longer label.
# Rows with a missing 'dietLabels' value are FALSE for every label.
encoded_data <- vapply(
  unique_dietLabels,
  function(label) {
    vapply(
      diet_label_lists,
      function(row_labels) label %in% row_labels,
      logical(1)
    )
  },
  logical(nrow(data))
)
# Convert the matrix to a data frame (one logical column per label)
encoded_data <- as.data.frame(encoded_data)
# Append the indicator columns to the original data
data <- cbind(data, encoded_data)
# Diet-label indicator columns whose prevalence is reported below
column_names <- c("HIGH_FIBER", "LOW_SODIUM", "LOW_FAT", "LOW_CARB", "BALANCED")
# Percentage of cereals carrying each diet label: the column mean of a logical
# indicator is the fraction of TRUE values, scaled here to percent
percentages <- 100 * colMeans(data[, column_names])
# Show the percentages, one named entry per diet label
print(percentages)
## HIGH_FIBER LOW_SODIUM LOW_FAT LOW_CARB BALANCED
## 17.837838 92.432432 63.243243 15.135135 3.243243
# Brand plus the five diet-label indicator columns
columns <- c("brand", "HIGH_FIBER", "LOW_SODIUM", "LOW_FAT", "LOW_CARB", "BALANCED")
data_logical <- data[, columns]
# Reshape to long format (one row per cereal and label); pivot_longer()
# replaces the superseded gather()
data_logical_long <- data_logical %>%
  tidyr::pivot_longer(
    cols = -brand,
    names_to = "column",
    values_to = "logical_value"
  )
# Stacked bars scaled to proportions (position = "fill"): share of TRUE/FALSE
# per brand, with one facet per diet label
plot_logical <- ggplot(data_logical_long, aes(x = brand, fill = logical_value)) +
  geom_bar(position = "fill") +
  labs(x = "Brand", y = "Proportion", title = "Proportion of Logical Values by Brand") +
  facet_wrap(~column, scales = "free_x") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the bar plot
print(plot_logical)

########################################
# #
# NUTRIENT ANALYSIS #
# #
########################################
# Source of the raw cereal CSV on GitHub
url <- "https://raw.githubusercontent.com/kwartler/Hult_Intro2R/main/A1_CerealEDA/cereals.csv"
# Read the file again so that `data` holds the unmodified columns, without the
# indicator columns appended in the previous sections
data <- readr::read_csv(file = url)
## Rows: 185 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): cerealName, parsedName, brand, dietLabels, healthLabels, rawGPTRan...
## dbl (59): calories, Energy_kcal, Total.lipid..fat._g, Fatty.acids..total.sat...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-column summary statistics of `data`, produced by skimr
summary_table <- skimr::skim(data)
# Display the summary
print(summary_table)
## ── Data Summary ────────────────────────
## Values
## Name data
## Number of rows 185
## Number of columns 66
## _______________________
## Column type frequency:
## character 7
## numeric 59
## ________________________
## Group variables None
##
## ── Variable type: character ────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 cerealName 0 1 3 45 0 185 0
## 2 parsedName 0 1 8 8 0 1 0
## 3 brand 0 1 4 12 0 5 0
## 4 dietLabels 2 0.989 7 32 0 9 0
## 5 healthLabels 0 1 294 447 0 60 0
## 6 rawGPTRank 0 1 1 264 0 39 0
## 7 gptDescription 0 1 105 312 0 185 0
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd
## 1 calories 0 1 340. 306.
## 2 Energy_kcal 0 1 340. 306.
## 3 Total.lipid..fat._g 0 1 13.3 28.9
## 4 Fatty.acids..total.saturated_g 0 1 2.51 9.36
## 5 Fatty.acids..total.trans_g 64 0.654 0.0790 0.678
## 6 Fatty.acids..total.monounsaturated_g 0 1 6.27 14.8
## 7 Fatty.acids..total.polyunsaturated_g 0 1 3.69 7.66
## 8 Carbohydrate..by.difference_g 0 1 54.3 45.8
## 9 Carbohydrates..net._g 0 1 44.8 43.7
## 10 Fiber..total.dietary_g 2 0.989 9.57 11.8
## 11 Sugars..total_g 17 0.908 18.0 41.6
## 12 Protein_g 0 1 9.08 10.0
## 13 Cholesterol_mg 0 1 2.64 35.9
## 14 Sodium..Na_mg 0 1 119. 125.
## 15 Calcium..Ca_mg 0 1 98.8 185.
## 16 Magnesium..Mg_mg 0 1 110. 119.
## 17 Potassium..K_mg 0 1 402. 367.
## 18 Iron..Fe_mg 0 1 6.13 4.63
## 19 Zinc..Zn_mg 0 1 3.51 2.69
## 20 Phosphorus..P_mg 0 1 284. 279.
## 21 Vitamin.A..RAE_µg 5 0.973 118. 168.
## 22 Vitamin.C..total.ascorbic.acid_mg 0 1 4.13 5.15
## 23 Thiamin_mg 0 1 0.496 0.408
## 24 Riboflavin_mg 0 1 0.404 0.387
## 25 Niacin_mg 0 1 5.52 4.94
## 26 Vitamin.B.6_mg 0 1 0.816 1.42
## 27 Folate..DFE_µg 3 0.984 238. 341.
## 28 Folate..food_µg 0 1 32.3 52.5
## 29 Folic.acid_µg 3 0.984 120. 205.
## 30 Vitamin.B.12_µg 3 0.984 1.48 2.94
## 31 Vitamin.D..D2...D3._µg 3 0.984 0.712 0.916
## 32 Vitamin.E..alpha.tocopherol._mg 17 0.908 2.23 6.12
## 33 Vitamin.K..phylloquinone._µg 26 0.859 3.26 7.02
## 34 Water_g 0 1 10.4 25.7
## 35 Energy_pct 0 1 17.0 15.3
## 36 Fat_pct 0 1 20.5 44.5
## 37 Saturated_pct 0 1 12.6 46.8
## 38 Carbs_pct 0 1 18.1 15.3
## 39 Fiber_pct 2 0.989 38.3 47.3
## 40 Protein_pct 0 1 18.2 20.0
## 41 Cholesterol_pct 0 1 0.879 12.0
## 42 Sodium_pct 0 1 4.97 5.21
## 43 Calcium_pct 0 1 9.89 18.5
## 44 Magnesium_pct 0 1 26.2 28.3
## 45 Potassium_pct 0 1 8.55 7.81
## 46 Iron_pct 0 1 34.1 25.7
## 47 Zinc_pct 0 1 32.0 24.5
## 48 Phosphorus_pct 0 1 40.5 39.9
## 49 Vitamin.A_pct 5 0.973 13.2 18.7
## 50 Vitamin.C_pct 0 1 4.59 5.72
## 51 Thiamin..B1._pct 0 1 41.5 34.0
## 52 Riboflavin..B2._pct 0 1 31.0 29.7
## 53 Niacin..B3._pct 0 1 34.5 30.9
## 54 Vitamin.B6_pct 0 1 62.7 109.
## 55 Folate.equivalent..total._pct 3 0.984 59.4 85.4
## 56 Vitamin.B12_pct 3 0.984 61.6 123.
## 57 Vitamin.D_pct 3 0.984 4.75 6.11
## 58 Vitamin.E_pct 17 0.908 14.9 40.8
## 59 Vitamin.K_pct 26 0.859 2.72 5.85
## p0 p25 p50 p75 p100 hist
## 1 26 141 196 433 1627 ▇▂▁▁▁
## 2 26.2 141. 196. 434. 1628. ▇▂▁▁▁
## 3 0 0.83 1.81 6.61 184. ▇▁▁▁▁
## 4 0 0.18 0.4 1.25 117. ▇▁▁▁▁
## 5 0 0 0 0.01 7.44 ▇▁▁▁▁
## 6 0 0.15 0.41 1.92 66.9 ▇▁▁▁▁
## 7 0 0.31 0.64 1.84 56.6 ▇▁▁▁▁
## 8 0.14 29.1 40.7 59.6 279. ▇▂▁▁▁
## 9 0.14 22.4 33.8 44.7 279. ▇▁▁▁▁
## 10 0 2.58 6.31 12.3 66.3 ▇▂▁▁▁
## 11 0.1 1.49 6.22 16.3 278. ▇▁▁▁▁
## 12 0.17 2.99 4.98 12.6 57.3 ▇▂▁▁▁
## 13 0 0 0 0 488. ▇▁▁▁▁
## 14 0 13.6 116. 210. 617. ▇▆▁▁▁
## 15 1.2 25.4 56.8 110. 1250. ▇▁▁▁▁
## 16 3.2 26.6 65.2 212. 456. ▇▁▂▁▁
## 17 17.6 122 211. 659. 1499. ▇▂▂▁▁
## 18 0.05 2.5 5.39 8.47 32.5 ▇▅▁▁▁
## 19 0.04 1.49 3.02 5 14.1 ▇▇▂▁▁
## 20 5.8 80.0 163. 556. 1314 ▇▁▂▁▁
## 21 0 0 8.75 228. 1553. ▇▁▁▁▁
## 22 0 0 1.46 7.99 21.1 ▇▃▁▁▁
## 23 0 0.22 0.43 0.65 2.16 ▇▆▂▁▁
## 24 0 0.13 0.27 0.58 1.68 ▇▆▁▁▁
## 25 0.04 1.88 5.17 7.01 33.8 ▇▃▁▁▁
## 26 0.01 0.19 0.5 0.68 7.44 ▇▁▁▁▁
## 27 1.45 29.4 158. 318. 2015. ▇▁▁▁▁
## 28 0.5 7.09 12.1 27.5 323. ▇▁▁▁▁
## 29 0 0 13.8 156. 1164. ▇▁▁▁▁
## 30 0 0 0 1.91 18.0 ▇▁▁▁▁
## 31 0 0 0 1.32 3.4 ▇▃▁▂▁
## 32 0 0.17 0.34 0.95 36.6 ▇▁▁▁▁
## 33 0 0.48 1.01 3.22 38.9 ▇▁▁▁▁
## 34 0.16 1.34 2.58 6.31 224. ▇▁▁▁▁
## 35 1.31 7.06 9.8 21.7 81.4 ▇▂▁▁▁
## 36 0 1.28 2.78 10.2 283. ▇▁▁▁▁
## 37 0 0.9 1.98 6.24 583. ▇▁▁▁▁
## 38 0.05 9.71 13.6 19.9 93.1 ▇▂▁▁▁
## 39 0 10.3 25.2 49.3 265. ▇▂▁▁▁
## 40 0.35 5.98 9.96 25.3 115. ▇▂▁▁▁
## 41 0 0 0 0 163. ▇▁▁▁▁
## 42 0 0.56 4.85 8.75 25.7 ▇▆▁▁▁
## 43 0.12 2.54 5.68 11.0 125. ▇▁▁▁▁
## 44 0.76 6.34 15.5 50.4 109. ▇▁▂▁▁
## 45 0.38 2.6 4.48 14.0 31.9 ▇▂▂▁▁
## 46 0.25 13.9 30.0 47.0 180. ▇▅▁▁▁
## 47 0.4 13.6 27.4 45.5 128. ▇▇▂▁▁
## 48 0.83 11.4 23.3 79.5 188. ▇▁▂▁▁
## 49 0 0 0.97 25.3 173. ▇▁▁▁▁
## 50 0 0 1.63 8.88 23.4 ▇▃▁▁▁
## 51 0 18.6 36 54.4 180. ▇▆▂▁▁
## 52 0 9.66 21.1 44.5 129. ▇▆▁▁▁
## 53 0.28 11.7 32.3 43.8 211. ▇▃▁▁▁
## 54 0.52 14.3 38.4 52.6 572. ▇▁▁▁▁
## 55 0.36 7.35 39.5 79.4 504. ▇▁▁▁▁
## 56 0 0 0 79.5 751. ▇▁▁▁▁
## 57 0 0 0 8.82 22.7 ▇▃▁▂▁
## 58 0 1.16 2.23 6.33 244. ▇▁▁▁▁
## 59 0 0.395 0.84 2.69 32.4 ▇▁▁▁▁
# Keep the cereal name, the brand and the numeric nutrient columns, in the
# order listed here; all other columns are dropped
data <- dplyr::select(
  data,
  dplyr::all_of(c(
    "cerealName", "brand", "calories", "Energy_kcal",
    "Total.lipid..fat._g", "Fatty.acids..total.saturated_g",
    "Fatty.acids..total.trans_g", "Fatty.acids..total.monounsaturated_g",
    "Fatty.acids..total.polyunsaturated_g", "Carbohydrate..by.difference_g",
    "Carbohydrates..net._g", "Fiber..total.dietary_g", "Sugars..total_g",
    "Protein_g", "Cholesterol_mg", "Sodium..Na_mg", "Calcium..Ca_mg",
    "Magnesium..Mg_mg", "Potassium..K_mg", "Iron..Fe_mg", "Zinc..Zn_mg",
    "Phosphorus..P_mg", "Vitamin.A..RAE_µg",
    "Vitamin.C..total.ascorbic.acid_mg", "Thiamin_mg", "Riboflavin_mg",
    "Niacin_mg", "Vitamin.B.6_mg", "Folate..DFE_µg", "Folate..food_µg",
    "Folic.acid_µg", "Vitamin.B.12_µg", "Vitamin.D..D2...D3._µg",
    "Vitamin.E..alpha.tocopherol._mg", "Vitamin.K..phylloquinone._µg",
    "Water_g", "Energy_pct", "Fat_pct", "Saturated_pct", "Carbs_pct",
    "Fiber_pct", "Protein_pct", "Cholesterol_pct", "Sodium_pct",
    "Calcium_pct", "Magnesium_pct", "Potassium_pct", "Iron_pct", "Zinc_pct",
    "Phosphorus_pct", "Vitamin.A_pct", "Vitamin.C_pct", "Thiamin..B1._pct",
    "Riboflavin..B2._pct", "Niacin..B3._pct", "Vitamin.B6_pct",
    "Folate.equivalent..total._pct", "Vitamin.B12_pct", "Vitamin.D_pct",
    "Vitamin.E_pct", "Vitamin.K_pct"
  ))
)
########################################
# #
# NUTRIENT: ENERGY CONTENT ANALYSIS #
# #
########################################
# Five cereals with the highest energy content (sorted descending)
top_5_energy <- head(arrange(data, desc(Energy_kcal)), n = 5)
# Five cereals with the lowest energy content (sorted ascending)
bottom_5_energy <- head(arrange(data, Energy_kcal), n = 5)
print(top_5_energy)
## # A tibble: 5 × 61
## cerealName brand calories Energy_kcal Total.lipid..fat._g
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Puffs, Peenut Butter gene… 1627 1628. 184.
## 2 General Mills Reese's Peanut B… gene… 1542 1543. 133.
## 3 Peanut Butter Puffs - Reese's … gene… 1542 1543. 133.
## 4 Reese's Peanut Butter Puffs gene… 1542 1543. 133.
## 5 Post Wheat and Barley post 1207 1207. 6.85
## # ℹ 56 more variables: Fatty.acids..total.saturated_g <dbl>,
## # Fatty.acids..total.trans_g <dbl>,
## # Fatty.acids..total.monounsaturated_g <dbl>,
## # Fatty.acids..total.polyunsaturated_g <dbl>,
## # Carbohydrate..by.difference_g <dbl>, Carbohydrates..net._g <dbl>,
## # Fiber..total.dietary_g <dbl>, Sugars..total_g <dbl>, Protein_g <dbl>,
## # Cholesterol_mg <dbl>, Sodium..Na_mg <dbl>, Calcium..Ca_mg <dbl>, …
# Show the five cereals with the lowest energy content
print(bottom_5_energy)
## # A tibble: 5 × 61
## cerealName brand calories Energy_kcal Total.lipid..fat._g
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Post Fruit & Fiber Dates, Rais… post 26 26.2 2.61
## 2 Pebbles, Fruit nabi… 73 73.5 0.17
## 3 Blueberry Morning, Post post 84 84.4 0.49
## 4 Post Blueberry Morning post 84 84.4 0.49
## 5 General Mills Kix gene… 85 85.7 0.83
## # ℹ 56 more variables: Fatty.acids..total.saturated_g <dbl>,
## # Fatty.acids..total.trans_g <dbl>,
## # Fatty.acids..total.monounsaturated_g <dbl>,
## # Fatty.acids..total.polyunsaturated_g <dbl>,
## # Carbohydrate..by.difference_g <dbl>, Carbohydrates..net._g <dbl>,
## # Fiber..total.dietary_g <dbl>, Sugars..total_g <dbl>, Protein_g <dbl>,
## # Cholesterol_mg <dbl>, Sodium..Na_mg <dbl>, Calcium..Ca_mg <dbl>, …
# Distribution of energy content across all cereals, in bins 10 kcal wide
ggplot(data, aes(x = Energy_kcal)) +
  geom_histogram(binwidth = 10, fill = "blue", color = "black") +
  labs(
    x = "Energy content (kcal)",
    y = "Frequency",
    title = "Distribution of energy content in ALL Cereals"
  )

##################################################
# #
# NUTRIENT:Sugar ANALYSIS #
# #
##################################################
# Mean total sugar over all cereals (missing values ignored), kept as a
# one-row tibble
avg_sugar <- summarise(data, avg_sugar = mean(Sugars..total_g, na.rm = TRUE))
# Cereals whose total sugar is strictly above that mean
high_sugar_cereals <- dplyr::filter(data, Sugars..total_g > avg_sugar$avg_sugar)
print(avg_sugar)
## # A tibble: 1 × 1
## avg_sugar
## <dbl>
## 1 18.0
# Show the cereals whose total sugar exceeds the average
print(high_sugar_cereals)
## # A tibble: 41 × 61
## cerealName brand calories Energy_kcal Total.lipid..fat._g
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 All Bran Bran Buds kell… 231 232. 2.25
## 2 Bran, Raisin quak… 433 434. 0.67
## 3 Corn Flakes, Honey Crunch kell… 1030 1031. 0
## 4 Cracklin' Oat Bran kell… 258 258. 9.21
## 5 Crispy Wheat with Raisins quak… 433 434. 0.67
## 6 General Mills Raisin Nut Bran gene… 240 240. 3.46
## 7 General Mills Reese's Peanut … gene… 1542 1543. 133.
## 8 General Mills Total Raisin Br… gene… 187 188. 1.6
## 9 General Mills Wheaties Raisin… gene… 187 188. 1.6
## 10 Grahams, Golden post 361 361. 8.9
## # ℹ 31 more rows
## # ℹ 56 more variables: Fatty.acids..total.saturated_g <dbl>,
## # Fatty.acids..total.trans_g <dbl>,
## # Fatty.acids..total.monounsaturated_g <dbl>,
## # Fatty.acids..total.polyunsaturated_g <dbl>,
## # Carbohydrate..by.difference_g <dbl>, Carbohydrates..net._g <dbl>,
## # Fiber..total.dietary_g <dbl>, Sugars..total_g <dbl>, Protein_g <dbl>, …
##################################################
# #
# NUTRIENT:FIBER ANALYSIS #
# #
##################################################
# Mean dietary fiber over all cereals (missing values ignored), kept as a
# one-row tibble
avg_fiber <- summarise(data, avg_fiber = mean(Fiber..total.dietary_g, na.rm = TRUE))
# Cereals whose dietary fiber is strictly above that mean
high_fiber_cereals <- dplyr::filter(data, Fiber..total.dietary_g > avg_fiber$avg_fiber)
print(avg_fiber)
## # A tibble: 1 × 1
## avg_fiber
## <dbl>
## 1 9.57
# Show the cereals whose dietary fiber exceeds the average
print(high_fiber_cereals)
## # A tibble: 56 × 61
## cerealName brand calories Energy_kcal Total.lipid..fat._g
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 100% Bran Cereal nabisco 161 161. 3.04
## 2 All Bran Bran Buds kellog 231 232. 2.25
## 3 Banana Nut Crunch post 813 814. 70.5
## 4 Barley, Wheat quaker 556 556. 2.62
## 5 Bran Cereal nabisco 161 161. 3.04
## 6 Bran Flakes kellog 161 161. 3.04
## 7 Cheerios-Apple Cinnamon generalMills 308 308. 1.55
## 8 Cheerios-Honey Nut generalMills 813 814. 70.5
## 9 Chex, Honey Nut generalMills 813 814. 70.5
## 10 Chex, Wheat generalMills 556 556. 2.62
## # ℹ 46 more rows
## # ℹ 56 more variables: Fatty.acids..total.saturated_g <dbl>,
## # Fatty.acids..total.trans_g <dbl>,
## # Fatty.acids..total.monounsaturated_g <dbl>,
## # Fatty.acids..total.polyunsaturated_g <dbl>,
## # Carbohydrate..by.difference_g <dbl>, Carbohydrates..net._g <dbl>,
## # Fiber..total.dietary_g <dbl>, Sugars..total_g <dbl>, Protein_g <dbl>, …
##################################################
# #
# NUTRIENT:CORRELATION fat_pct,cholesterol_mg #
# #
##################################################
# Correlation between Fat_pct and Cholesterol_mg; rows with a missing value in
# either column are left out ("complete.obs")
correlation <- with(data, cor(Fat_pct, Cholesterol_mg, use = "complete.obs"))
print(correlation)
## [1] 0.4365212
##################################################
# #
# NUTRIENT:BRAND fat_pct,cholesterol_mg #
# #
##################################################
# Mean of every numeric nutrient column, computed separately for each brand.
# The NA handling is written as an explicit lambda because forwarding extra
# arguments through across(...) is deprecated as of dplyr 1.1.0.
nutrient_by_brand <- data %>%
  group_by(brand) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))
print(nutrient_by_brand)
## # A tibble: 5 × 60
## brand calories Energy_kcal Total.lipid..fat._g Fatty.acids..total.sat…¹
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 generalMills 300. 300. 14.2 3.70
## 2 kellog 246. 247. 4.24 1.20
## 3 nabisco 251. 251. 3.56 1.08
## 4 post 426. 426. 18.8 2.05
## 5 quaker 478. 479. 19.2 2.60
## # ℹ abbreviated name: ¹Fatty.acids..total.saturated_g
## # ℹ 55 more variables: Fatty.acids..total.trans_g <dbl>,
## # Fatty.acids..total.monounsaturated_g <dbl>,
## # Fatty.acids..total.polyunsaturated_g <dbl>,
## # Carbohydrate..by.difference_g <dbl>, Carbohydrates..net._g <dbl>,
## # Fiber..total.dietary_g <dbl>, Sugars..total_g <dbl>, Protein_g <dbl>,
## # Cholesterol_mg <dbl>, Sodium..Na_mg <dbl>, Calcium..Ca_mg <dbl>, …
# Distribution of dietary fiber across cereals, in bins 1 g wide
ggplot(data, aes(x = Fiber..total.dietary_g)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +
  labs(
    x = "Fiber content (g)",
    y = "Frequency",
    title = "Distribution of fiber content in cereals"
  )
## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).

# Scatter plot of fat percentage against cholesterol in mg. The y aesthetic
# uses Cholesterol_mg so that the plot matches its "(mg)" axis label and the
# Fat_pct / Cholesterol_mg correlation computed earlier in this section
# (the percentage column Cholesterol_pct was plotted here before).
ggplot(data, aes(x = Fat_pct, y = Cholesterol_mg)) +
  geom_point() +
  xlab("Fat Percentage") +
  ylab("Cholesterol (mg)") +
  ggtitle("Scatter plot of Fat Percentage vs Cholesterol")

##################################################
# #
# CORRELATION ANALYSIS #
# #
##################################################
# Restrict the data to its numeric columns
numeric_data <- data[, vapply(data, is.numeric, logical(1))]
# Pairwise correlations; each pair uses the rows where both values are present
correlation_matrix <- cor(numeric_data, use = "pairwise.complete.obs")
# Colour-coded correlation plot with smaller label text and custom margins
corrplot(
  corr = correlation_matrix,
  method = "color",
  tl.cex = 0.8,
  cl.cex = 0.8,
  mar = c(1, 1, 3, 4)
)

##################################################
# #
# REGRESSION ANALYSIS #
# #
##################################################
##################################################
# #
# CLEANING #
# #
##################################################
# Address of the raw cereal CSV on GitHub
url <- "https://raw.githubusercontent.com/kwartler/Hult_Intro2R/main/A1_CerealEDA/cereals.csv"
# Start the regression section from a freshly read copy of the full data set
data <- readr::read_csv(file = url)
## Rows: 185 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): cerealName, parsedName, brand, dietLabels, healthLabels, rawGPTRan...
## dbl (59): calories, Energy_kcal, Total.lipid..fat._g, Fatty.acids..total.sat...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the rows that have at most 2 missing values (counted across all
# columns, numeric or not).
data <- data[rowSums(is.na(data)) <= 2, ]
# Mean-impute the remaining NA values in every numeric column.
# The result is written back to `data`, not only to `data_imputed`: the steps
# that follow all read `data`, so storing the imputed values only in
# `data_imputed` meant they were never used. Plain numeric vectors are produced
# so that later is.numeric() checks and lm() see ordinary doubles.
data <- data %>%
  mutate(across(
    where(is.numeric),
    ~ replace(.x, is.na(.x), mean(.x, na.rm = TRUE))
  ))
# Kept so that any code referring to `data_imputed` continues to work.
data_imputed <- data
##################################################
# #
# ONE HOT ENCODING #
# #
##################################################
########################################
# #
# HEALTH #
# #
########################################
# One-hot encode the `healthLabels` column, whose entries are label lists
# separated by ", ".
# Split every row into its individual labels.
health_tokens <- strsplit(data$healthLabels, ", ", fixed = TRUE)
# All labels over all rows, and the distinct labels among them. Missing values
# are dropped from the distinct set so no all-NA column named "NA" is created.
all_labels <- unlist(health_tokens)
unique_labels <- unique(all_labels[!is.na(all_labels)])
# For each label, flag the rows whose label list contains exactly that label.
# Whole-label comparison (%in%) is used instead of grepl(): grepl() treats the
# label as a regular expression and also matches it inside longer labels, which
# can flag rows that do not actually carry the label.
encoded_data <- vapply(
  unique_labels,
  function(label) {
    vapply(health_tokens, function(tokens) label %in% tokens, logical(1))
  },
  logical(length(health_tokens))
)
# Force a rows-by-labels logical matrix (vapply() yields a plain vector when
# there is a single row), then turn it into a data frame, one column per label.
encoded_data <- matrix(
  encoded_data,
  nrow = length(health_tokens),
  ncol = length(unique_labels),
  dimnames = list(NULL, unique_labels)
)
encoded_data <- as.data.frame(encoded_data)
# Append the indicator columns to the right of the existing columns.
data <- cbind(data, encoded_data)
########################################
# #
# DIET #
# #
########################################
# One-hot encode the `dietLabels` column, whose entries are label lists
# separated by ", ".
# Split every row into its individual labels.
diet_tokens <- strsplit(data$dietLabels, ", ", fixed = TRUE)
# All diet labels over all rows, and the distinct labels among them. Missing
# values are dropped from the distinct set so no all-NA column named "NA" is
# created.
all_dietLabels <- unlist(diet_tokens)
unique_dietLabels <- unique(all_dietLabels[!is.na(all_dietLabels)])
# For each diet label, flag the rows whose label list contains exactly that
# label. Whole-label comparison (%in%) is used instead of grepl(): grepl()
# treats the label as a regular expression and also matches it inside longer
# labels, which can flag rows that do not actually carry the label.
encoded_data <- vapply(
  unique_dietLabels,
  function(label) {
    vapply(diet_tokens, function(tokens) label %in% tokens, logical(1))
  },
  logical(length(diet_tokens))
)
# Force a rows-by-labels logical matrix (vapply() yields a plain vector when
# there is a single row), then turn it into a data frame, one column per label.
encoded_data <- matrix(
  encoded_data,
  nrow = length(diet_tokens),
  ncol = length(unique_dietLabels),
  dimnames = list(NULL, unique_dietLabels)
)
encoded_data <- as.data.frame(encoded_data)
# Append the indicator columns to the right of the existing columns.
data <- cbind(data, encoded_data)
# Work on 'rawGPTRank' as text first so it can be pattern-matched.
data$rawGPTRank <- as.character(data$rawGPTRank)
# Identify rows whose 'rawGPTRank' value is a plain non-negative decimal number.
# Accepts "5", "5.", "5.3" and ".5"; rejects NA, free text, and strings such as
# "." or "1.2.3", which consist only of digits and dots but are not numbers.
numeric_rows <- grepl("^([0-9]+\\.?[0-9]*|\\.[0-9]+)$", data$rawGPTRank)
# Subset the dataframe to include only these rows
data <- data[numeric_rows, ]
# Convert the surviving values to numeric so the column can be used as a
# numeric response in the regression instead of remaining a character vector.
data$rawGPTRank <- as.numeric(data$rawGPTRank)
# Fit a linear regression with 'rawGPTRank' as the response and every other
# numeric column of `data` as a predictor, then print the model summary.
# Flag the numeric columns and explicitly exclude the response itself.
numeric_vars <- vapply(data, is.numeric, logical(1))
numeric_vars["rawGPTRank"] <- FALSE
numeric_cols <- names(which(numeric_vars))
# Build the formula `rawGPTRank ~ <col 1> + <col 2> + ...`.
regression_formula <- reformulate(
  termlabels = numeric_cols,
  response = "rawGPTRank"
)
# Fit the model on `data`.
model <- lm(formula = regression_formula, data = data)
# Show coefficients, residual quantiles and overall fit statistics.
summary(model)
##
## Call:
## lm(formula = regression_formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6250 0.0000 0.0000 0.3125 1.3750
##
## Coefficients: (21 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.27509 9.50493 3.290 0.00164 **
## calories 6.04930 2.05338 2.946 0.00451 **
## Energy_kcal -126.71458 46.06711 -2.751 0.00776 **
## Total.lipid..fat._g -110.44983 109.93764 -1.005 0.31890
## Fatty.acids..total.saturated_g 1952.29834 685.47689 2.848 0.00593 **
## Fatty.acids..total.trans_g -143.21922 55.39990 -2.585 0.01206 *
## Fatty.acids..total.monounsaturated_g -11.66551 5.51019 -2.117 0.03821 *
## Fatty.acids..total.polyunsaturated_g -11.27790 4.79389 -2.353 0.02178 *
## Carbohydrate..by.difference_g 185.08042 113.13661 1.636 0.10685
## Carbohydrates..net._g -681.53293 248.75193 -2.740 0.00799 **
## Fiber..total.dietary_g -679.80366 247.97234 -2.741 0.00795 **
## Sugars..total_g 1.29942 0.47096 2.759 0.00758 **
## Protein_g 4.53735 1.77904 2.550 0.01320 *
## Cholesterol_mg -2.59453 0.94834 -2.736 0.00807 **
## Sodium..Na_mg 0.02976 0.01116 2.665 0.00975 **
## Calcium..Ca_mg -0.01420 0.01194 -1.190 0.23859
## Magnesium..Mg_mg -0.38838 0.13803 -2.814 0.00653 **
## Potassium..K_mg 0.04488 0.01316 3.410 0.00114 **
## Iron..Fe_mg 1.48292 0.55607 2.667 0.00972 **
## Zinc..Zn_mg 4.90079 1.92445 2.547 0.01333 *
## Phosphorus..P_mg -0.11459 0.04681 -2.448 0.01717 *
## Vitamin.A..RAE_µg -0.04541 0.01591 -2.854 0.00583 **
## Vitamin.C..total.ascorbic.acid_mg -3.44057 1.25582 -2.740 0.00799 **
## Thiamin_mg 4.53000 2.77818 1.631 0.10797
## Riboflavin_mg -7.79950 2.63311 -2.962 0.00431 **
## Niacin_mg -5.11805 1.87923 -2.723 0.00835 **
## Vitamin.B.6_mg 18.15867 7.34980 2.471 0.01621 *
## Folate..DFE_µg 58.51579 20.82501 2.810 0.00659 **
## Folate..food_µg -59.68778 21.25060 -2.809 0.00661 **
## Folic.acid_µg -99.51582 35.41547 -2.810 0.00659 **
## Vitamin.B.12_µg 1.37699 0.49790 2.766 0.00745 **
## Vitamin.D..D2...D3._µg 4.75141 1.70265 2.791 0.00695 **
## Vitamin.E..alpha.tocopherol._mg 6.86078 2.51417 2.729 0.00823 **
## Vitamin.K..phylloquinone._µg -4.10880 1.60858 -2.554 0.01307 *
## Water_g 0.58664 0.23753 2.470 0.01624 *
## Energy_pct 2412.12504 883.37226 2.731 0.00819 **
## Fat_pct 76.33909 72.93711 1.047 0.29926
## Saturated_pct -387.36367 136.10420 -2.846 0.00597 **
## Carbs_pct 1489.78256 461.46497 3.228 0.00198 **
## Fiber_pct NA NA NA NA
## Protein_pct NA NA NA NA
## Cholesterol_pct NA NA NA NA
## Sodium_pct NA NA NA NA
## Calcium_pct NA NA NA NA
## Magnesium_pct NA NA NA NA
## Potassium_pct NA NA NA NA
## Iron_pct NA NA NA NA
## Zinc_pct NA NA NA NA
## Phosphorus_pct NA NA NA NA
## Vitamin.A_pct NA NA NA NA
## Vitamin.C_pct NA NA NA NA
## Thiamin..B1._pct NA NA NA NA
## Riboflavin..B2._pct NA NA NA NA
## Niacin..B3._pct NA NA NA NA
## Vitamin.B6_pct NA NA NA NA
## Folate.equivalent..total._pct NA NA NA NA
## Vitamin.B12_pct NA NA NA NA
## Vitamin.D_pct NA NA NA NA
## Vitamin.E_pct NA NA NA NA
## Vitamin.K_pct NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7068 on 63 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.7511, Adjusted R-squared: 0.601
## F-statistic: 5.004 on 38 and 63 DF, p-value: 9.59e-09
#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#$ $
#$ T H A N K $
#$ Y O U $
#$ $
#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# Print "THANK YOU" as four rows of banner text built from "#" characters.
# Takes no arguments, writes to standard output, and returns NULL invisibly.
print_big_thank_you <- function() {
  # Four-row glyph for every character used in the message.
  glyphs <- list(
    "T" = c("#####", " # ", " # ", " # "),
    "H" = c("# #", "# #", "#####", "# #"),
    "A" = c(" ## ", "# #", "#####", "# #"),
    "N" = c("# #", "## #", "# # #", "# ##"),
    "K" = c("# #", "# # ", "## ", "# # "),
    "Y" = c("# #", " # # ", " # ", " # "),
    "O" = c("#####", "# #", "# #", "#####"),
    "U" = c("# #", "# #", "# #", "#####"),
    " " = c(" ", " ", " ", " ")
  )
  message_chars <- c("T", "H", "A", "N", "K", " ", "Y", "O", "U")
  # Assemble each banner row: every glyph slice is followed by two spaces.
  banner_rows <- vapply(
    seq_len(4L),
    function(row_idx) {
      slices <- vapply(
        message_chars,
        function(ch) glyphs[[ch]][row_idx],
        character(1)
      )
      paste0(slices, "  ", collapse = "")
    },
    character(1)
  )
  # Emit the rows, one per line.
  cat(paste0(banner_rows, "\n"), sep = "")
  invisible(NULL)
}
# Print the banner.
print_big_thank_you()
## ##### # # ## # # # # # # ##### # #
## # # # # # ## # # # # # # # # #
## # ##### ##### # # # ## # # # # #
## # # # # # # ## # # # ##### #####